import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import scipy as scp
import plotly.offline as pyo
import sklearn
from sklearn.preprocessing import KBinsDiscretizer
data_matches = pd.read_csv("./worldcup-1.1.0/data-csv/matches.csv")
data_stadiums = pd.read_csv("./worldcup-1.1.0/data-csv/stadiums.csv")
data_attendance = pd.read_csv("./attendance.csv")
data_matches.info()
data_matches.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 964 entries, 0 to 963 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 964 non-null int64 1 tournament_id 964 non-null object 2 tournament_name 964 non-null object 3 match_id 964 non-null object 4 match_name 964 non-null object 5 stage_name 964 non-null object 6 group_name 964 non-null object 7 group_stage 964 non-null int64 8 knockout_stage 964 non-null int64 9 replayed 964 non-null int64 10 replay 964 non-null int64 11 match_date 964 non-null object 12 match_time 964 non-null object 13 stadium_id 964 non-null object 14 stadium_name 964 non-null object 15 city_name 964 non-null object 16 country_name 964 non-null object 17 home_team_id 964 non-null object 18 home_team_name 964 non-null object 19 home_team_code 964 non-null object 20 away_team_id 964 non-null object 21 away_team_name 964 non-null object 22 away_team_code 964 non-null object 23 score 964 non-null object 24 home_team_score 964 non-null int64 25 away_team_score 964 non-null int64 26 home_team_score_margin 964 non-null int64 27 away_team_score_margin 964 non-null int64 28 extra_time 964 non-null int64 29 penalty_shootout 964 non-null int64 30 score_penalties 964 non-null object 31 home_team_score_penalties 964 non-null int64 32 away_team_score_penalties 964 non-null int64 33 result 964 non-null object 34 home_team_win 964 non-null int64 35 away_team_win 964 non-null int64 36 draw 964 non-null int64 dtypes: int64(16), object(21) memory usage: 278.8+ KB
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | away_team_score_margin | extra_time | penalty_shootout | score_penalties | home_team_score_penalties | away_team_score_penalties | result | home_team_win | away_team_win | draw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | -3 | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 |
| 1 | 2 | WC-1930 | 1930 FIFA World Cup | M-1930-02 | United States v Belgium | group stage | Group 4 | 1 | 0 | 0 | ... | -3 | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 |
| 2 | 3 | WC-1930 | 1930 FIFA World Cup | M-1930-03 | Yugoslavia v Brazil | group stage | Group 2 | 1 | 0 | 0 | ... | -1 | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 |
| 3 | 4 | WC-1930 | 1930 FIFA World Cup | M-1930-04 | Romania v Peru | group stage | Group 3 | 1 | 0 | 0 | ... | -2 | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | M-1930-05 | Argentina v France | group stage | Group 1 | 1 | 0 | 0 | ... | -1 | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 |
5 rows × 37 columns
data_stadiums.info()
data_stadiums.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 193 entries, 0 to 192 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 193 non-null int64 1 stadium_id 193 non-null object 2 stadium_name 193 non-null object 3 city_name 193 non-null object 4 country_name 193 non-null object 5 stadium_capacity 193 non-null int64 6 stadium_wikipedia_link 193 non-null object 7 city_wikipedia_link 193 non-null object dtypes: int64(2), object(6) memory usage: 12.2+ KB
| key_id | stadium_id | stadium_name | city_name | country_name | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | S-001 | Estadio José Amalfitani | Buenos Aires | Argentina | 49000 | https://en.wikipedia.org/wiki/José_Amalfitani_... | https://en.wikipedia.org/wiki/Buenos_Aires |
| 1 | 2 | S-002 | Estadio Monumental | Buenos Aires | Argentina | 75000 | https://en.wikipedia.org/wiki/Estadio_Monument... | https://en.wikipedia.org/wiki/Buenos_Aires |
| 2 | 3 | S-003 | Estadio Chateau Carreras | Córdoba | Argentina | 47000 | https://en.wikipedia.org/wiki/Estadio_Mario_Al... | https://en.wikipedia.org/wiki/Córdoba,_Argentina |
| 3 | 4 | S-004 | Estadio José María Minella | Mar del Plata | Argentina | 44000 | https://en.wikipedia.org/wiki/Estadio_José_Mar... | https://en.wikipedia.org/wiki/Mar_del_Plata |
| 4 | 5 | S-005 | Estadio Ciudad de Mendoza | Mendoza | Argentina | 35000 | https://en.wikipedia.org/wiki/Estadio_Malvinas... | https://en.wikipedia.org/wiki/Mendoza,_Argentina |
data_attendance.info()
data_attendance.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 964 entries, 0 to 963 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 home_team 964 non-null object 1 away_team 964 non-null object 2 Attendance 964 non-null int64 3 Date 964 non-null object dtypes: int64(1), object(3) memory usage: 30.2+ KB
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 0 | Argentina | France | 88966 | 2022-12-18 |
| 1 | Croatia | Morocco | 44137 | 2022-12-17 |
| 2 | France | Morocco | 68294 | 2022-12-14 |
| 3 | Argentina | Croatia | 88966 | 2022-12-13 |
| 4 | Morocco | Portugal | 44198 | 2022-12-10 |
print(data_matches.columns.values)
print(data_stadiums.columns.values)
print(data_attendance.columns.values)
['key_id' 'tournament_id' 'tournament_name' 'match_id' 'match_name' 'stage_name' 'group_name' 'group_stage' 'knockout_stage' 'replayed' 'replay' 'match_date' 'match_time' 'stadium_id' 'stadium_name' 'city_name' 'country_name' 'home_team_id' 'home_team_name' 'home_team_code' 'away_team_id' 'away_team_name' 'away_team_code' 'score' 'home_team_score' 'away_team_score' 'home_team_score_margin' 'away_team_score_margin' 'extra_time' 'penalty_shootout' 'score_penalties' 'home_team_score_penalties' 'away_team_score_penalties' 'result' 'home_team_win' 'away_team_win' 'draw'] ['key_id' 'stadium_id' 'stadium_name' 'city_name' 'country_name' 'stadium_capacity' 'stadium_wikipedia_link' 'city_wikipedia_link'] ['home_team' 'away_team' 'Attendance' 'Date']
s1 = set(data_attendance.home_team.value_counts().index) - \
set(data_matches.home_team_name.value_counts().index)
s1
s2 = set(data_matches.home_team_name.value_counts().index) - \
set(data_attendance.home_team.value_counts().index)
s2
print(f"the difference between data_attendance and data_matches is: \n \t{s1}")
print(f"the difference between data_matches and data_attendance is: \n \t{s2}")
the difference between data_attendance and data_matches is:
{'FR Yugoslavia', 'Korea DPR', "Côte d'Ivoire", 'Germany DR', 'IR Iran', 'China PR', 'Korea Republic', 'Türkiye'}
the difference between data_matches and data_attendance is:
{'Turkey', 'Iran', 'East Germany', 'Ivory Coast', 'South Korea', 'China', 'North Korea'}
replace:
- China PR -> China,
- Côte d'Ivoire -> Ivory Coast,
- Germany DR -> East Germany,
- IR Iran -> Iran,
- Korea DPR -> North Korea,
- Korea Republic -> South Korea,
- Türkiye -> Turkey
- FR Yugoslavia -> Yugoslavia
replace_team_name_map = {'China PR': "China",
"Côte d'Ivoire": "Ivory Coast",
'Germany DR': "East Germany",
'IR Iran': "Iran",
'Korea DPR': "North Korea",
'Korea Republic': "South Korea",
'Türkiye': "Turkey",
"FR Yugoslavia": "Yugoslavia"}
data_attendance = data_attendance.replace(
{"home_team": replace_team_name_map, "away_team": replace_team_name_map})
data_attendance.head()
data_attendance.home_team.value_counts().index
data_attendance.away_team.value_counts().index
Index(['Mexico', 'France', 'England', 'Spain', 'Brazil', 'Argentina',
'Belgium', 'Italy', 'Uruguay', 'Switzerland', 'Netherlands',
'United States', 'Sweden', 'Croatia', 'Czechoslovakia', 'South Korea',
'Yugoslavia', 'Bulgaria', 'Chile', 'West Germany', 'Poland', 'Austria',
'Paraguay', 'Germany', 'Morocco', 'Portugal', 'Hungary', 'Scotland',
'Colombia', 'Denmark', 'Costa Rica', 'Iran', 'Cameroon', 'Soviet Union',
'Australia', 'Romania', 'Peru', 'Tunisia', 'Saudi Arabia', 'Japan',
'Nigeria', 'Ghana', 'Northern Ireland', 'Turkey', 'Republic of Ireland',
'Russia', 'Senegal', 'Ecuador', 'Algeria', 'Norway', 'Greece',
'Ivory Coast', 'El Salvador', 'Egypt', 'Serbia', 'New Zealand',
'Bolivia', 'Wales', 'North Korea', 'Ukraine', 'Honduras', 'Canada',
'Kuwait', 'Slovenia', 'South Africa', 'Israel', 'East Germany',
'Jamaica', 'China', 'Trinidad and Tobago', 'Serbia and Montenegro',
'United Arab Emirates', 'Angola', 'Haiti', 'Slovakia',
'Bosnia and Herzegovina', 'Iceland', 'Panama', 'Cuba', 'Zaire', 'Qatar',
'Iraq', 'Czech Republic', 'Togo', 'Dutch East Indies'],
dtype='object', name='away_team')
s1 = set(data_attendance.home_team.value_counts().index) - \
set(data_matches.home_team_name.value_counts().index)
s1
s2 = set(data_matches.home_team_name.value_counts().index) - \
set(data_attendance.home_team.value_counts().index)
s2
print(f"the difference between data_attendance and data_matches is: \n \t{s1}")
print(f"the difference between data_matches and data_attendance is: \n \t{s2}")
the difference between data_attendance and data_matches is: set() the difference between data_matches and data_attendance is: set()
isnan_attendance = data_attendance.isnull().sum().sum()
isnan_stadiums = data_stadiums.isnull().sum().sum()
isnan_matches = data_matches.isnull().sum().sum()
print(f"the counts of nan in attendance data is {isnan_attendance}")
print(f"the counts of nan in stadiums data is {isnan_stadiums}")
print(f"the counts of nan in matches data is {isnan_matches}")
the counts of nan in attendance data is 0 the counts of nan in stadiums data is 0 the counts of nan in matches data is 0
merge_matches = pd.merge(data_matches, data_stadiums,
on="stadium_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
merge_matches.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | score_penalties | home_team_score_penalties | away_team_score_penalties | result | home_team_win | away_team_win | draw | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 10000 | https://en.wikipedia.org/wiki/Estadio_Pocitos | https://en.wikipedia.org/wiki/Montevideo |
| 1 | 4 | WC-1930 | 1930 FIFA World Cup | M-1930-04 | Romania v Peru | group stage | Group 3 | 1 | 0 | 0 | ... | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 10000 | https://en.wikipedia.org/wiki/Estadio_Pocitos | https://en.wikipedia.org/wiki/Montevideo |
| 2 | 2 | WC-1930 | 1930 FIFA World Cup | M-1930-02 | United States v Belgium | group stage | Group 4 | 1 | 0 | 0 | ... | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 20000 | https://en.wikipedia.org/wiki/Estadio_Gran_Par... | https://en.wikipedia.org/wiki/Montevideo |
| 3 | 3 | WC-1930 | 1930 FIFA World Cup | M-1930-03 | Yugoslavia v Brazil | group stage | Group 2 | 1 | 0 | 0 | ... | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 20000 | https://en.wikipedia.org/wiki/Estadio_Gran_Par... | https://en.wikipedia.org/wiki/Montevideo |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | M-1930-05 | Argentina v France | group stage | Group 1 | 1 | 0 | 0 | ... | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 20000 | https://en.wikipedia.org/wiki/Estadio_Gran_Par... | https://en.wikipedia.org/wiki/Montevideo |
5 rows × 40 columns
merge_matches.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 964 entries, 0 to 963 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 964 non-null int64 1 tournament_id 964 non-null object 2 tournament_name 964 non-null object 3 match_id 964 non-null object 4 match_name 964 non-null object 5 stage_name 964 non-null object 6 group_name 964 non-null object 7 group_stage 964 non-null int64 8 knockout_stage 964 non-null int64 9 replayed 964 non-null int64 10 replay 964 non-null int64 11 match_date 964 non-null object 12 match_time 964 non-null object 13 stadium_id 964 non-null object 14 stadium_name 964 non-null object 15 city_name 964 non-null object 16 country_name 964 non-null object 17 home_team_id 964 non-null object 18 home_team_name 964 non-null object 19 home_team_code 964 non-null object 20 away_team_id 964 non-null object 21 away_team_name 964 non-null object 22 away_team_code 964 non-null object 23 score 964 non-null object 24 home_team_score 964 non-null int64 25 away_team_score 964 non-null int64 26 home_team_score_margin 964 non-null int64 27 away_team_score_margin 964 non-null int64 28 extra_time 964 non-null int64 29 penalty_shootout 964 non-null int64 30 score_penalties 964 non-null object 31 home_team_score_penalties 964 non-null int64 32 away_team_score_penalties 964 non-null int64 33 result 964 non-null object 34 home_team_win 964 non-null int64 35 away_team_win 964 non-null int64 36 draw 964 non-null int64 37 stadium_capacity 964 non-null int64 38 stadium_wikipedia_link 964 non-null object 39 city_wikipedia_link 964 non-null object dtypes: int64(17), object(23) memory usage: 301.4+ KB
merge_matches.columns.values
array(['key_id', 'tournament_id', 'tournament_name', 'match_id',
'match_name', 'stage_name', 'group_name', 'group_stage',
'knockout_stage', 'replayed', 'replay', 'match_date', 'match_time',
'stadium_id', 'stadium_name', 'city_name', 'country_name',
'home_team_id', 'home_team_name', 'home_team_code', 'away_team_id',
'away_team_name', 'away_team_code', 'score', 'home_team_score',
'away_team_score', 'home_team_score_margin',
'away_team_score_margin', 'extra_time', 'penalty_shootout',
'score_penalties', 'home_team_score_penalties',
'away_team_score_penalties', 'result', 'home_team_win',
'away_team_win', 'draw', 'stadium_capacity',
'stadium_wikipedia_link', 'city_wikipedia_link'], dtype=object)
merge_matches.isnull().sum(axis=0).sum()
0
merge_matches_attendance1 = pd.merge(merge_matches, data_attendance, how='inner',
left_on=['home_team_name',
'away_team_name', 'match_date'],
right_on=['home_team', 'away_team', 'Date'])
merge_matches_attendance1.columns.values
array(['key_id', 'tournament_id', 'tournament_name', 'match_id',
'match_name', 'stage_name', 'group_name', 'group_stage',
'knockout_stage', 'replayed', 'replay', 'match_date', 'match_time',
'stadium_id', 'stadium_name', 'city_name', 'country_name',
'home_team_id', 'home_team_name', 'home_team_code', 'away_team_id',
'away_team_name', 'away_team_code', 'score', 'home_team_score',
'away_team_score', 'home_team_score_margin',
'away_team_score_margin', 'extra_time', 'penalty_shootout',
'score_penalties', 'home_team_score_penalties',
'away_team_score_penalties', 'result', 'home_team_win',
'away_team_win', 'draw', 'stadium_capacity',
'stadium_wikipedia_link', 'city_wikipedia_link', 'home_team',
'away_team', 'Attendance', 'Date'], dtype=object)
merge_matches_attendance1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 937 entries, 0 to 936 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 937 non-null int64 1 tournament_id 937 non-null object 2 tournament_name 937 non-null object 3 match_id 937 non-null object 4 match_name 937 non-null object 5 stage_name 937 non-null object 6 group_name 937 non-null object 7 group_stage 937 non-null int64 8 knockout_stage 937 non-null int64 9 replayed 937 non-null int64 10 replay 937 non-null int64 11 match_date 937 non-null object 12 match_time 937 non-null object 13 stadium_id 937 non-null object 14 stadium_name 937 non-null object 15 city_name 937 non-null object 16 country_name 937 non-null object 17 home_team_id 937 non-null object 18 home_team_name 937 non-null object 19 home_team_code 937 non-null object 20 away_team_id 937 non-null object 21 away_team_name 937 non-null object 22 away_team_code 937 non-null object 23 score 937 non-null object 24 home_team_score 937 non-null int64 25 away_team_score 937 non-null int64 26 home_team_score_margin 937 non-null int64 27 away_team_score_margin 937 non-null int64 28 extra_time 937 non-null int64 29 penalty_shootout 937 non-null int64 30 score_penalties 937 non-null object 31 home_team_score_penalties 937 non-null int64 32 away_team_score_penalties 937 non-null int64 33 result 937 non-null object 34 home_team_win 937 non-null int64 35 away_team_win 937 non-null int64 36 draw 937 non-null int64 37 stadium_capacity 937 non-null int64 38 stadium_wikipedia_link 937 non-null object 39 city_wikipedia_link 937 non-null object 40 home_team 937 non-null object 41 away_team 937 non-null object 42 Attendance 937 non-null int64 43 Date 937 non-null object dtypes: int64(18), object(26) memory usage: 322.2+ KB
merge_matches_attendance2 = pd.merge(merge_matches, data_attendance, how='inner',
left_on=['home_team_name',
'away_team_name', 'match_date'],
right_on=['away_team', 'home_team', 'Date'])
merge_matches_attendance2.columns.values
merge_matches_attendance2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21 entries, 0 to 20 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 21 non-null int64 1 tournament_id 21 non-null object 2 tournament_name 21 non-null object 3 match_id 21 non-null object 4 match_name 21 non-null object 5 stage_name 21 non-null object 6 group_name 21 non-null object 7 group_stage 21 non-null int64 8 knockout_stage 21 non-null int64 9 replayed 21 non-null int64 10 replay 21 non-null int64 11 match_date 21 non-null object 12 match_time 21 non-null object 13 stadium_id 21 non-null object 14 stadium_name 21 non-null object 15 city_name 21 non-null object 16 country_name 21 non-null object 17 home_team_id 21 non-null object 18 home_team_name 21 non-null object 19 home_team_code 21 non-null object 20 away_team_id 21 non-null object 21 away_team_name 21 non-null object 22 away_team_code 21 non-null object 23 score 21 non-null object 24 home_team_score 21 non-null int64 25 away_team_score 21 non-null int64 26 home_team_score_margin 21 non-null int64 27 away_team_score_margin 21 non-null int64 28 extra_time 21 non-null int64 29 penalty_shootout 21 non-null int64 30 score_penalties 21 non-null object 31 home_team_score_penalties 21 non-null int64 32 away_team_score_penalties 21 non-null int64 33 result 21 non-null object 34 home_team_win 21 non-null int64 35 away_team_win 21 non-null int64 36 draw 21 non-null int64 37 stadium_capacity 21 non-null int64 38 stadium_wikipedia_link 21 non-null object 39 city_wikipedia_link 21 non-null object 40 home_team 21 non-null object 41 away_team 21 non-null object 42 Attendance 21 non-null int64 43 Date 21 non-null object dtypes: int64(18), object(26) memory usage: 7.3+ KB
merge_matches_attendance = pd.concat(
[merge_matches_attendance2, merge_matches_attendance1], ignore_index=True, sort=False)
merge_matches_attendance.info()
merge_matches_attendance.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 958 entries, 0 to 957 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 958 non-null int64 1 tournament_id 958 non-null object 2 tournament_name 958 non-null object 3 match_id 958 non-null object 4 match_name 958 non-null object 5 stage_name 958 non-null object 6 group_name 958 non-null object 7 group_stage 958 non-null int64 8 knockout_stage 958 non-null int64 9 replayed 958 non-null int64 10 replay 958 non-null int64 11 match_date 958 non-null object 12 match_time 958 non-null object 13 stadium_id 958 non-null object 14 stadium_name 958 non-null object 15 city_name 958 non-null object 16 country_name 958 non-null object 17 home_team_id 958 non-null object 18 home_team_name 958 non-null object 19 home_team_code 958 non-null object 20 away_team_id 958 non-null object 21 away_team_name 958 non-null object 22 away_team_code 958 non-null object 23 score 958 non-null object 24 home_team_score 958 non-null int64 25 away_team_score 958 non-null int64 26 home_team_score_margin 958 non-null int64 27 away_team_score_margin 958 non-null int64 28 extra_time 958 non-null int64 29 penalty_shootout 958 non-null int64 30 score_penalties 958 non-null object 31 home_team_score_penalties 958 non-null int64 32 away_team_score_penalties 958 non-null int64 33 result 958 non-null object 34 home_team_win 958 non-null int64 35 away_team_win 958 non-null int64 36 draw 958 non-null int64 37 stadium_capacity 958 non-null int64 38 stadium_wikipedia_link 958 non-null object 39 city_wikipedia_link 958 non-null object 40 home_team 958 non-null object 41 away_team 958 non-null object 42 Attendance 958 non-null int64 43 Date 958 non-null object dtypes: int64(18), object(26) memory usage: 329.4+ KB
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | home_team_win | away_team_win | draw | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | home_team | away_team | Attendance | Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 459 | WC-1990 | 1990 FIFA World Cup | M-1990-47 | Czechoslovakia v West Germany | quarter-finals | not applicable | 0 | 1 | 0 | ... | 0 | 1 | 0 | 75000 | https://en.wikipedia.org/wiki/San_Siro | https://en.wikipedia.org/wiki/Milan | West Germany | Czechoslovakia | 73347 | 1990-07-01 |
| 1 | 94 | WC-1954 | 1954 FIFA World Cup | M-1954-19 | Austria v Switzerland | quarter-finals | not applicable | 0 | 1 | 0 | ... | 1 | 0 | 0 | 50000 | https://en.wikipedia.org/wiki/Stade_Olympique_... | https://en.wikipedia.org/wiki/Lausanne | Switzerland | Austria | 35000 | 1954-06-26 |
| 2 | 91 | WC-1954 | 1954 FIFA World Cup | M-1954-16 | England v Switzerland | group stage | Group 4 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | Switzerland | England | 43500 | 1954-06-20 |
| 3 | 185 | WC-1966 | 1966 FIFA World Cup | M-1966-17 | Mexico v Uruguay | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 99000 | https://en.wikipedia.org/wiki/Wembley_Stadium_... | https://en.wikipedia.org/wiki/London | Uruguay | Mexico | 61112 | 1966-07-19 |
| 4 | 180 | WC-1966 | 1966 FIFA World Cup | M-1966-12 | Chile v North Korea | group stage | Group 4 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 40000 | https://en.wikipedia.org/wiki/Ayresome_Park | https://en.wikipedia.org/wiki/Middlesbrough | North Korea | Chile | 13792 | 1966-07-15 |
5 rows × 44 columns
ids = merge_matches_attendance.match_id.values
rows_out_matches = merge_matches[~merge_matches.match_id.isin(ids)]
dates = rows_out_matches.match_date.values
rows_out_matches[["home_team_name", "away_team_name"]]
| home_team_name | away_team_name | |
|---|---|---|
| 118 | West Germany | Yugoslavia |
| 127 | West Germany | Turkey |
| 130 | West Germany | Turkey |
| 133 | West Germany | Hungary |
| 136 | Hungary | West Germany |
| 139 | West Germany | Austria |
fix_attendance = data_attendance[(data_attendance.Date.isin(dates)) & (
data_attendance.home_team.isin(["Germany"])
| data_attendance.away_team.isin(["Germany"]))]
fix_attendance
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 863 | Germany | Hungary | 62500 | 1954-07-04 |
| 866 | Germany | Austria | 58000 | 1954-06-30 |
| 867 | Germany | Yugoslavia | 17000 | 1954-06-27 |
| 871 | Germany | Turkey | 17000 | 1954-06-23 |
| 873 | Hungary | Germany | 56000 | 1954-06-20 |
| 883 | Germany | Turkey | 28000 | 1954-06-17 |
fix_team_name_map = {
'Germany': "West Germany"
}
fix_attendance = fix_attendance.replace(
{"home_team": fix_team_name_map, "away_team": fix_team_name_map})
fix_attendance
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 863 | West Germany | Hungary | 62500 | 1954-07-04 |
| 866 | West Germany | Austria | 58000 | 1954-06-30 |
| 867 | West Germany | Yugoslavia | 17000 | 1954-06-27 |
| 871 | West Germany | Turkey | 17000 | 1954-06-23 |
| 873 | Hungary | West Germany | 56000 | 1954-06-20 |
| 883 | West Germany | Turkey | 28000 | 1954-06-17 |
merge_out_matches = pd.merge(rows_out_matches, fix_attendance, how='inner',
left_on=['away_team_name',
'home_team_name', 'match_date'],
right_on=['away_team', 'home_team', 'Date'])
merge_out_matches
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | home_team_win | away_team_win | draw | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | home_team | away_team | Attendance | Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 97 | WC-1954 | 1954 FIFA World Cup | M-1954-22 | West Germany v Yugoslavia | quarter-finals | not applicable | 0 | 1 | 0 | ... | 1 | 0 | 0 | 36000 | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | West Germany | Yugoslavia | 17000 | 1954-06-27 |
| 1 | 92 | WC-1954 | 1954 FIFA World Cup | M-1954-17 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 35000 | https://en.wikipedia.org/wiki/Hardturm | https://en.wikipedia.org/wiki/Zürich | West Germany | Turkey | 17000 | 1954-06-23 |
| 2 | 82 | WC-1954 | 1954 FIFA World Cup | M-1954-07 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | West Germany | Turkey | 28000 | 1954-06-17 |
| 3 | 101 | WC-1954 | 1954 FIFA World Cup | M-1954-26 | West Germany v Hungary | final | not applicable | 0 | 1 | 0 | ... | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | West Germany | Hungary | 62500 | 1954-07-04 |
| 4 | 88 | WC-1954 | 1954 FIFA World Cup | M-1954-13 | Hungary v West Germany | group stage | Group 2 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 55000 | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | Hungary | West Germany | 56000 | 1954-06-20 |
| 5 | 99 | WC-1954 | 1954 FIFA World Cup | M-1954-24 | West Germany v Austria | semi-finals | not applicable | 0 | 1 | 0 | ... | 1 | 0 | 0 | 55000 | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | West Germany | Austria | 58000 | 1954-06-30 |
6 rows × 44 columns
final_matches_attendance = pd.concat(
[merge_out_matches, merge_matches_attendance], ignore_index=True, sort=False)
final_matches_attendance.head()
final_matches_attendance.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 964 entries, 0 to 963 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 964 non-null int64 1 tournament_id 964 non-null object 2 tournament_name 964 non-null object 3 match_id 964 non-null object 4 match_name 964 non-null object 5 stage_name 964 non-null object 6 group_name 964 non-null object 7 group_stage 964 non-null int64 8 knockout_stage 964 non-null int64 9 replayed 964 non-null int64 10 replay 964 non-null int64 11 match_date 964 non-null object 12 match_time 964 non-null object 13 stadium_id 964 non-null object 14 stadium_name 964 non-null object 15 city_name 964 non-null object 16 country_name 964 non-null object 17 home_team_id 964 non-null object 18 home_team_name 964 non-null object 19 home_team_code 964 non-null object 20 away_team_id 964 non-null object 21 away_team_name 964 non-null object 22 away_team_code 964 non-null object 23 score 964 non-null object 24 home_team_score 964 non-null int64 25 away_team_score 964 non-null int64 26 home_team_score_margin 964 non-null int64 27 away_team_score_margin 964 non-null int64 28 extra_time 964 non-null int64 29 penalty_shootout 964 non-null int64 30 score_penalties 964 non-null object 31 home_team_score_penalties 964 non-null int64 32 away_team_score_penalties 964 non-null int64 33 result 964 non-null object 34 home_team_win 964 non-null int64 35 away_team_win 964 non-null int64 36 draw 964 non-null int64 37 stadium_capacity 964 non-null int64 38 stadium_wikipedia_link 964 non-null object 39 city_wikipedia_link 964 non-null object 40 home_team 964 non-null object 41 away_team 964 non-null object 42 Attendance 964 non-null int64 43 Date 964 non-null object dtypes: int64(18), object(26) memory usage: 331.5+ KB
final_matches_attendance.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | home_team_win | away_team_win | draw | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | home_team | away_team | Attendance | Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 97 | WC-1954 | 1954 FIFA World Cup | M-1954-22 | West Germany v Yugoslavia | quarter-finals | not applicable | 0 | 1 | 0 | ... | 1 | 0 | 0 | 36000 | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | West Germany | Yugoslavia | 17000 | 1954-06-27 |
| 1 | 92 | WC-1954 | 1954 FIFA World Cup | M-1954-17 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 35000 | https://en.wikipedia.org/wiki/Hardturm | https://en.wikipedia.org/wiki/Zürich | West Germany | Turkey | 17000 | 1954-06-23 |
| 2 | 82 | WC-1954 | 1954 FIFA World Cup | M-1954-07 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | West Germany | Turkey | 28000 | 1954-06-17 |
| 3 | 101 | WC-1954 | 1954 FIFA World Cup | M-1954-26 | West Germany v Hungary | final | not applicable | 0 | 1 | 0 | ... | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | West Germany | Hungary | 62500 | 1954-07-04 |
| 4 | 88 | WC-1954 | 1954 FIFA World Cup | M-1954-13 | Hungary v West Germany | group stage | Group 2 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 55000 | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | Hungary | West Germany | 56000 | 1954-06-20 |
5 rows × 44 columns
merge_matches_attendance.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 958 entries, 0 to 957 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 958 non-null int64 1 tournament_id 958 non-null object 2 tournament_name 958 non-null object 3 match_id 958 non-null object 4 match_name 958 non-null object 5 stage_name 958 non-null object 6 group_name 958 non-null object 7 group_stage 958 non-null int64 8 knockout_stage 958 non-null int64 9 replayed 958 non-null int64 10 replay 958 non-null int64 11 match_date 958 non-null object 12 match_time 958 non-null object 13 stadium_id 958 non-null object 14 stadium_name 958 non-null object 15 city_name 958 non-null object 16 country_name 958 non-null object 17 home_team_id 958 non-null object 18 home_team_name 958 non-null object 19 home_team_code 958 non-null object 20 away_team_id 958 non-null object 21 away_team_name 958 non-null object 22 away_team_code 958 non-null object 23 score 958 non-null object 24 home_team_score 958 non-null int64 25 away_team_score 958 non-null int64 26 home_team_score_margin 958 non-null int64 27 away_team_score_margin 958 non-null int64 28 extra_time 958 non-null int64 29 penalty_shootout 958 non-null int64 30 score_penalties 958 non-null object 31 home_team_score_penalties 958 non-null int64 32 away_team_score_penalties 958 non-null int64 33 result 958 non-null object 34 home_team_win 958 non-null int64 35 away_team_win 958 non-null int64 36 draw 958 non-null int64 37 stadium_capacity 958 non-null int64 38 stadium_wikipedia_link 958 non-null object 39 city_wikipedia_link 958 non-null object 40 home_team 958 non-null object 41 away_team 958 non-null object 42 Attendance 958 non-null int64 43 Date 958 non-null object dtypes: int64(18), object(26) memory usage: 329.4+ KB
final_matches_attendance.isnull().sum(axis=0).sum()
0
final_matches_attendance[['match_name', 'match_date', 'stadium_id',
'stadium_capacity', 'home_team', 'away_team', 'Attendance', 'Date']].head()
| match_name | match_date | stadium_id | stadium_capacity | home_team | away_team | Attendance | Date | |
|---|---|---|---|---|---|---|---|---|
| 0 | West Germany v Yugoslavia | 1954-06-27 | S-178 | 36000 | West Germany | Yugoslavia | 17000 | 1954-06-27 |
| 1 | West Germany v Turkey | 1954-06-23 | S-181 | 35000 | West Germany | Turkey | 17000 | 1954-06-23 |
| 2 | West Germany v Turkey | 1954-06-17 | S-177 | 65000 | West Germany | Turkey | 28000 | 1954-06-17 |
| 3 | West Germany v Hungary | 1954-07-04 | S-177 | 65000 | West Germany | Hungary | 62500 | 1954-07-04 |
| 4 | Hungary v West Germany | 1954-06-20 | S-176 | 55000 | Hungary | West Germany | 56000 | 1954-06-20 |
final_matches_attendance = final_matches_attendance.drop(
['home_team', 'away_team'], axis=1)
final_matches_attendance.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | away_team_score_penalties | result | home_team_win | away_team_win | draw | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | Attendance | Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 97 | WC-1954 | 1954 FIFA World Cup | M-1954-22 | West Germany v Yugoslavia | quarter-finals | not applicable | 0 | 1 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 36000 | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | 17000 | 1954-06-27 |
| 1 | 92 | WC-1954 | 1954 FIFA World Cup | M-1954-17 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 35000 | https://en.wikipedia.org/wiki/Hardturm | https://en.wikipedia.org/wiki/Zürich | 17000 | 1954-06-23 |
| 2 | 82 | WC-1954 | 1954 FIFA World Cup | M-1954-07 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 28000 | 1954-06-17 |
| 3 | 101 | WC-1954 | 1954 FIFA World Cup | M-1954-26 | West Germany v Hungary | final | not applicable | 0 | 1 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 65000 | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 62500 | 1954-07-04 |
| 4 | 88 | WC-1954 | 1954 FIFA World Cup | M-1954-13 | Hungary v West Germany | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 55000 | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | 56000 | 1954-06-20 |
5 rows × 42 columns
data_players = pd.read_csv("./worldcup-1.1.0/data-csv/players.csv")
data_players.info()
data_players.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8485 entries, 0 to 8484 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 8485 non-null int64 1 player_id 8485 non-null object 2 family_name 8485 non-null object 3 given_name 8485 non-null object 4 birth_date 8484 non-null object 5 goal_keeper 8485 non-null int64 6 defender 8485 non-null int64 7 midfielder 8485 non-null int64 8 forward 8485 non-null int64 9 count_tournaments 8485 non-null int64 10 list_tournaments 8485 non-null object 11 player_wikipedia_link 8485 non-null object dtypes: int64(6), object(6) memory usage: 795.6+ KB
| key_id | player_id | family_name | given_name | birth_date | goal_keeper | defender | midfielder | forward | count_tournaments | list_tournaments | player_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | P-08891 | A'Court | Alan | 1934-09-30 | 0 | 0 | 0 | 1 | 1 | 1958 | https://en.wikipedia.org/wiki/Alan_A%27Court |
| 1 | 2 | P-08589 | Aaronson | Brenden | 2000-10-22 | 0 | 0 | 0 | 1 | 1 | 2022 | https://en.wikipedia.org/wiki/Brenden_Aaronson |
| 2 | 3 | P-04897 | Abadzhiev | Stefan | 1934-07-03 | 0 | 0 | 0 | 1 | 1 | 1966 | https://en.wikipedia.org/wiki/Stefan_Abadzhiev |
| 3 | 4 | P-05556 | Abalo | Jean-Paul | 1975-06-26 | 0 | 1 | 0 | 0 | 1 | 2006 | https://en.wikipedia.org/wiki/Jean-Paul_Abalo |
| 4 | 5 | P-08163 | Abanda | Patrice | 1978-08-03 | 0 | 1 | 0 | 0 | 1 | 1998 | https://en.wikipedia.org/wiki/Patrice_Abanda |
data_players.isnull().sum(axis=0)
key_id 0 player_id 0 family_name 0 given_name 0 birth_date 1 goal_keeper 0 defender 0 midfielder 0 forward 0 count_tournaments 0 list_tournaments 0 player_wikipedia_link 0 dtype: int64
data_players.sort_values(["count_tournaments"], ascending=False)
| key_id | player_id | family_name | given_name | birth_date | goal_keeper | defender | midfielder | forward | count_tournaments | list_tournaments | player_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1235 | 1236 | P-00677 | Buffon | Gianluigi | 1978-01-28 | 1 | 0 | 0 | 0 | 5 | 1998, 2002, 2006, 2010, 2014 | https://en.wikipedia.org/wiki/Gianluigi_Buffon |
| 4968 | 4969 | P-09502 | Matthäus | Lothar | 1961-03-21 | 0 | 1 | 1 | 0 | 5 | 1982, 1986, 1990, 1994, 1998 | https://en.wikipedia.org/wiki/Lothar_Matth%C3%... |
| 6690 | 6691 | P-03142 | Ronaldo | Cristiano | 1985-02-05 | 0 | 0 | 0 | 1 | 5 | 2006, 2010, 2014, 2018, 2022 | https://en.wikipedia.org/wiki/Cristiano_Ronaldo |
| 1408 | 1409 | P-08981 | Carbajal | Antonio | 1929-06-07 | 1 | 0 | 0 | 0 | 5 | 1950, 1954, 1958, 1962, 1966 | https://en.wikipedia.org/wiki/Antonio_Carbajal |
| 5724 | 5725 | P-08954 | Ochoa | Guillermo | 1985-07-13 | 1 | 0 | 0 | 0 | 5 | 2006, 2010, 2014, 2018, 2022 | https://en.wikipedia.org/wiki/Guillermo_Ochoa |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3190 | 3191 | P-08662 | Habermann | Sven | 1961-11-03 | 1 | 0 | 0 | 0 | 1 | 1986 | https://en.wikipedia.org/wiki/Sven_Habermann |
| 3189 | 3190 | P-04091 | Hababi | El Arbi Hababi | 1967-08-12 | 0 | 0 | 1 | 0 | 1 | 1994 | https://en.wikipedia.org/wiki/El_Arbi_Hababi |
| 3188 | 3189 | P-08920 | Haas | Mario | 1974-09-16 | 0 | 0 | 0 | 1 | 1 | 1998 | https://en.wikipedia.org/wiki/Mario_Haas |
| 3185 | 3186 | P-01120 | Ha | Jung-won | 1942-04-20 | 0 | 1 | 0 | 0 | 1 | 1966 | https://en.wikipedia.org/wiki/Ha_Jung-won |
| 8484 | 8485 | P-03072 | Zykov | Valery | 1944-02-24 | 0 | 0 | 1 | 0 | 1 | 1970 | https://en.wikipedia.org/wiki/Valery_Zykov |
8485 rows × 12 columns
data_squads = pd.read_csv("./worldcup-1.1.0/data-csv/squads.csv")
data_squads.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10973 entries, 0 to 10972 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 10973 non-null int64 1 tournament_id 10973 non-null object 2 tournament_name 10973 non-null object 3 team_id 10973 non-null object 4 team_name 10973 non-null object 5 team_code 10973 non-null object 6 player_id 10973 non-null object 7 family_name 10973 non-null object 8 given_name 10973 non-null object 9 shirt_number 10973 non-null int64 10 position_name 10973 non-null object 11 position_code 10973 non-null object dtypes: int64(2), object(10) memory usage: 1.0+ MB
data_squads.describe()
data_squads.head()
| key_id | tournament_id | tournament_name | team_id | team_name | team_code | player_id | family_name | given_name | shirt_number | position_name | position_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | T-03 | Argentina | ARG | P-06987 | Bossio | Ángel | 0 | goal keeper | GK |
| 1 | 2 | WC-1930 | 1930 FIFA World Cup | T-03 | Argentina | ARG | P-00287 | Botasso | Juan | 0 | goal keeper | GK |
| 2 | 3 | WC-1930 | 1930 FIFA World Cup | T-03 | Argentina | ARG | P-01435 | Cherro | Roberto | 0 | forward | FW |
| 3 | 4 | WC-1930 | 1930 FIFA World Cup | T-03 | Argentina | ARG | P-01321 | Chividini | Alberto | 0 | defender | DF |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | T-03 | Argentina | ARG | P-08552 | Della Torre | José | 0 | defender | DF |
data_squads.isnull().sum(axis=0)
key_id 0 tournament_id 0 tournament_name 0 team_id 0 team_name 0 team_code 0 player_id 0 family_name 0 given_name 0 shirt_number 0 position_name 0 position_code 0 dtype: int64
players_teams = pd.merge(data_players, data_squads,
on="player_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
players_teams.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10973 entries, 0 to 10972 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 10973 non-null int64 1 player_id 10973 non-null object 2 family_name 10973 non-null object 3 given_name 10973 non-null object 4 birth_date 10971 non-null object 5 goal_keeper 10973 non-null int64 6 defender 10973 non-null int64 7 midfielder 10973 non-null int64 8 forward 10973 non-null int64 9 count_tournaments 10973 non-null int64 10 list_tournaments 10973 non-null object 11 player_wikipedia_link 10973 non-null object 12 tournament_id 10973 non-null object 13 tournament_name 10973 non-null object 14 team_id 10973 non-null object 15 team_name 10973 non-null object 16 team_code 10973 non-null object 17 shirt_number 10973 non-null int64 18 position_name 10973 non-null object 19 position_code 10973 non-null object dtypes: int64(7), object(13) memory usage: 1.7+ MB
players_teams.isnull().sum(axis=0).to_frame(name='counts').query('counts > 0')
| counts | |
|---|---|
| birth_date | 2 |
players_teams['team_count'] = players_teams.groupby(
'player_id')['team_name'].transform('count')
team_players = players_teams.drop_duplicates(['player_id', 'team_name']).groupby(
'player_id')['team_name'].apply(lambda x: ' ,'.join(x)).reset_index()
team_players = team_players.rename(columns={'team_name': 'teams_names'})
players_teams = pd.merge(players_teams, team_players,
on="player_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
players_teams.head()
| key_id | player_id | family_name | given_name | birth_date | goal_keeper | defender | midfielder | forward | count_tournaments | ... | tournament_id | tournament_name | team_id | team_name | team_code | shirt_number | position_name | position_code | team_count | teams_names | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | P-08891 | A'Court | Alan | 1934-09-30 | 0 | 0 | 0 | 1 | 1 | ... | WC-1958 | 1958 FIFA World Cup | T-27 | England | ENG | 21 | forward | FW | 1 | England |
| 1 | 2 | P-08589 | Aaronson | Brenden | 2000-10-22 | 0 | 0 | 0 | 1 | 1 | ... | WC-2022 | 2022 FIFA World Cup | T-80 | United States | USA | 11 | forward | FW | 1 | United States |
| 2 | 3 | P-04897 | Abadzhiev | Stefan | 1934-07-03 | 0 | 0 | 0 | 1 | 1 | ... | WC-1966 | 1966 FIFA World Cup | T-10 | Bulgaria | BGR | 17 | forward | FW | 1 | Bulgaria |
| 3 | 4 | P-05556 | Abalo | Jean-Paul | 1975-06-26 | 0 | 1 | 0 | 0 | 1 | ... | WC-2006 | 2006 FIFA World Cup | T-74 | Togo | TGO | 3 | defender | DF | 1 | Togo |
| 4 | 5 | P-08163 | Abanda | Patrice | 1978-08-03 | 0 | 1 | 0 | 0 | 1 | ... | WC-1998 | 1998 FIFA World Cup | T-11 | Cameroon | CMR | 13 | defender | DF | 1 | Cameroon |
5 rows × 22 columns
code_players = players_teams.drop_duplicates(['player_id', 'team_code']).groupby(
'player_id')['team_code'].apply(lambda x: ' ,'.join(x)).reset_index()
code_players = code_players.rename(columns={'team_code': 'teams_codes'})
players_teams = pd.merge(players_teams, code_players,
on="player_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
players_teams.head()
| key_id | player_id | family_name | given_name | birth_date | goal_keeper | defender | midfielder | forward | count_tournaments | ... | tournament_name | team_id | team_name | team_code | shirt_number | position_name | position_code | team_count | teams_names | teams_codes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | P-08891 | A'Court | Alan | 1934-09-30 | 0 | 0 | 0 | 1 | 1 | ... | 1958 FIFA World Cup | T-27 | England | ENG | 21 | forward | FW | 1 | England | ENG |
| 1 | 2 | P-08589 | Aaronson | Brenden | 2000-10-22 | 0 | 0 | 0 | 1 | 1 | ... | 2022 FIFA World Cup | T-80 | United States | USA | 11 | forward | FW | 1 | United States | USA |
| 2 | 3 | P-04897 | Abadzhiev | Stefan | 1934-07-03 | 0 | 0 | 0 | 1 | 1 | ... | 1966 FIFA World Cup | T-10 | Bulgaria | BGR | 17 | forward | FW | 1 | Bulgaria | BGR |
| 3 | 4 | P-05556 | Abalo | Jean-Paul | 1975-06-26 | 0 | 1 | 0 | 0 | 1 | ... | 2006 FIFA World Cup | T-74 | Togo | TGO | 3 | defender | DF | 1 | Togo | TGO |
| 4 | 5 | P-08163 | Abanda | Patrice | 1978-08-03 | 0 | 1 | 0 | 0 | 1 | ... | 1998 FIFA World Cup | T-11 | Cameroon | CMR | 13 | defender | DF | 1 | Cameroon | CMR |
5 rows × 23 columns
players_teams['team_names'] = players_teams.groupby(
'player_id')['team_name'].transform(lambda x: ', '.join(x))
players_teams[["player_id", "team_count", "team_names"]]
players_teams.team_names.sort_values()
1069 Algeria
3747 Algeria
4464 Algeria
4994 Algeria
3620 Algeria
...
6625 Zaire
7202 Zaire
7137 Zaire
10018 Zaire
6083 Zaire
Name: team_names, Length: 10973, dtype: object
players_teams['team_names'] = players_teams.groupby(
'player_id')['team_name'].transform(lambda x: list(x))
players_teams['team_count'] = players_teams.groupby(
'player_id')['team_name'].transform('count')
players_teams[["player_id", "team_count"]]
players_teams
| key_id | player_id | family_name | given_name | birth_date | goal_keeper | defender | midfielder | forward | count_tournaments | ... | team_id | team_name | team_code | shirt_number | position_name | position_code | team_count | teams_names | teams_codes | team_names | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | P-08891 | A'Court | Alan | 1934-09-30 | 0 | 0 | 0 | 1 | 1 | ... | T-27 | England | ENG | 21 | forward | FW | 1 | England | ENG | England |
| 1 | 2 | P-08589 | Aaronson | Brenden | 2000-10-22 | 0 | 0 | 0 | 1 | 1 | ... | T-80 | United States | USA | 11 | forward | FW | 1 | United States | USA | United States |
| 2 | 3 | P-04897 | Abadzhiev | Stefan | 1934-07-03 | 0 | 0 | 0 | 1 | 1 | ... | T-10 | Bulgaria | BGR | 17 | forward | FW | 1 | Bulgaria | BGR | Bulgaria |
| 3 | 4 | P-05556 | Abalo | Jean-Paul | 1975-06-26 | 0 | 1 | 0 | 0 | 1 | ... | T-74 | Togo | TGO | 3 | defender | DF | 1 | Togo | TGO | Togo |
| 4 | 5 | P-08163 | Abanda | Patrice | 1978-08-03 | 0 | 1 | 0 | 0 | 1 | ... | T-11 | Cameroon | CMR | 13 | defender | DF | 1 | Cameroon | CMR | Cameroon |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10968 | 8481 | P-06718 | Żurawski | Maciej | 1976-09-12 | 0 | 0 | 0 | 1 | 2 | ... | T-55 | Poland | POL | 9 | forward | FW | 2 | Poland | POL | Poland |
| 10969 | 8482 | P-03465 | Żurkowski | Szymon | 1997-09-25 | 0 | 0 | 1 | 0 | 1 | ... | T-55 | Poland | POL | 17 | midfielder | MF | 1 | Poland | POL | Poland |
| 10970 | 8483 | P-08318 | Zusi | Graham | 1986-08-18 | 0 | 0 | 1 | 0 | 1 | ... | T-80 | United States | USA | 19 | midfielder | MF | 1 | United States | USA | United States |
| 10971 | 8484 | P-05921 | Zygmantovich | Andrei | 1962-12-02 | 0 | 0 | 1 | 0 | 1 | ... | T-70 | Soviet Union | SUN | 17 | midfielder | MF | 1 | Soviet Union | SUN | Soviet Union |
| 10972 | 8485 | P-03072 | Zykov | Valery | 1944-02-24 | 0 | 0 | 1 | 0 | 1 | ... | T-70 | Soviet Union | SUN | 10 | midfielder | MF | 1 | Soviet Union | SUN | Soviet Union |
10973 rows × 24 columns
merge_matches_attendance = final_matches_attendance
merge_matches_attendance.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 964 entries, 0 to 963 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 964 non-null int64 1 tournament_id 964 non-null object 2 tournament_name 964 non-null object 3 match_id 964 non-null object 4 match_name 964 non-null object 5 stage_name 964 non-null object 6 group_name 964 non-null object 7 group_stage 964 non-null int64 8 knockout_stage 964 non-null int64 9 replayed 964 non-null int64 10 replay 964 non-null int64 11 match_date 964 non-null object 12 match_time 964 non-null object 13 stadium_id 964 non-null object 14 stadium_name 964 non-null object 15 city_name 964 non-null object 16 country_name 964 non-null object 17 home_team_id 964 non-null object 18 home_team_name 964 non-null object 19 home_team_code 964 non-null object 20 away_team_id 964 non-null object 21 away_team_name 964 non-null object 22 away_team_code 964 non-null object 23 score 964 non-null object 24 home_team_score 964 non-null int64 25 away_team_score 964 non-null int64 26 home_team_score_margin 964 non-null int64 27 away_team_score_margin 964 non-null int64 28 extra_time 964 non-null int64 29 penalty_shootout 964 non-null int64 30 score_penalties 964 non-null object 31 home_team_score_penalties 964 non-null int64 32 away_team_score_penalties 964 non-null int64 33 result 964 non-null object 34 home_team_win 964 non-null int64 35 away_team_win 964 non-null int64 36 draw 964 non-null int64 37 stadium_capacity 964 non-null int64 38 stadium_wikipedia_link 964 non-null object 39 city_wikipedia_link 964 non-null object 40 Attendance 964 non-null int64 41 Date 964 non-null object dtypes: int64(18), object(24) memory usage: 316.4+ KB
data_matches['total_goals_in_match'] = data_matches['home_team_score'] + data_matches['away_team_score'] + \
data_matches['home_team_score_penalties'] + \
data_matches['away_team_score_penalties']
data_matches[['match_id', 'match_name', 'total_goals_in_match']].head()
| match_id | match_name | total_goals_in_match | |
|---|---|---|---|
| 0 | M-1930-01 | France v Mexico | 5 |
| 1 | M-1930-02 | United States v Belgium | 3 |
| 2 | M-1930-03 | Yugoslavia v Brazil | 3 |
| 3 | M-1930-04 | Romania v Peru | 4 |
| 4 | M-1930-05 | Argentina v France | 1 |
data_matches['match_for_host'] = np.where(data_matches['country_name'] == data_matches['home_team_name'], True, False) | np.where(
data_matches['country_name'] == data_matches['away_team_name'], True, False)
data_matches[['match_id', 'match_name',
'country_name', 'match_for_host']].head()
| match_id | match_name | country_name | match_for_host | |
|---|---|---|---|---|
| 0 | M-1930-01 | France v Mexico | Uruguay | False |
| 1 | M-1930-02 | United States v Belgium | Uruguay | False |
| 2 | M-1930-03 | Yugoslavia v Brazil | Uruguay | False |
| 3 | M-1930-04 | Romania v Peru | Uruguay | False |
| 4 | M-1930-05 | Argentina v France | Uruguay | False |
merge_matches_attendance['used_capacity_ratio'] = round(
((merge_matches_attendance['Attendance'] / merge_matches_attendance['stadium_capacity']) * 100), 2)
data_matches['used_capacity_ratio'] = round(
((merge_matches_attendance['Attendance'] / merge_matches_attendance['stadium_capacity']) * 100), 2)
data_matches[['match_id', 'used_capacity_ratio']].head()
| match_id | used_capacity_ratio | |
|---|---|---|
| 0 | M-1930-01 | 47.22 |
| 1 | M-1930-02 | 48.57 |
| 2 | M-1930-03 | 43.08 |
| 3 | M-1930-04 | 96.15 |
| 4 | M-1930-05 | 101.82 |
transformer_attendance_category = KBinsDiscretizer(n_bins=10, encode='ordinal')
merge_matches_attendance['attendance_category'] = transformer_attendance_category.fit_transform(
merge_matches_attendance['Attendance'].values.reshape(-1, 1))
np.unique(merge_matches_attendance['attendance_category'])
array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
transformer = KBinsDiscretizer(n_bins=5, encode='ordinal')
merge_matches_attendance['relative_attendance_category'] = transformer.fit_transform(
data_matches['used_capacity_ratio'].values.reshape(-1, 1))
np.unique(merge_matches_attendance['relative_attendance_category'])
merge_matches_attendance['used_capacity_ratio'].isnull()
0 False
1 False
2 False
3 False
4 False
...
959 False
960 False
961 False
962 False
963 False
Name: used_capacity_ratio, Length: 964, dtype: bool
data_teams = pd.read_csv("./worldcup-1.1.0/data-csv/teams.csv")
data_teams.info()
merge_matches_attendance = pd.merge(
merge_matches_attendance, data_teams[["team_name", "team_code"]], left_on=["country_name"], right_on=["team_name"], how="inner")
merge_matches_attendance["host_country_code"] = merge_matches_attendance["team_code"]
merge_matches_attendance
<class 'pandas.core.frame.DataFrame'> RangeIndex: 85 entries, 0 to 84 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 85 non-null int64 1 team_id 85 non-null object 2 team_name 85 non-null object 3 team_code 85 non-null object 4 federation_name 85 non-null object 5 region_name 85 non-null object 6 confederation_id 85 non-null object 7 confederation_name 85 non-null object 8 confederation_code 85 non-null object 9 team_wikipedia_link 85 non-null object 10 federation_wikipedia_link 85 non-null object dtypes: int64(1), object(10) memory usage: 7.4+ KB
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | stadium_wikipedia_link | city_wikipedia_link | Attendance | Date | used_capacity_ratio | attendance_category | relative_attendance_category | team_name | team_code | host_country_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 97 | WC-1954 | 1954 FIFA World Cup | M-1954-22 | West Germany v Yugoslavia | quarter-finals | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | 17000 | 1954-06-27 | 47.22 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 1 | 92 | WC-1954 | 1954 FIFA World Cup | M-1954-17 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Hardturm | https://en.wikipedia.org/wiki/Zürich | 17000 | 1954-06-23 | 48.57 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 2 | 82 | WC-1954 | 1954 FIFA World Cup | M-1954-07 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 28000 | 1954-06-17 | 43.08 | 2.0 | 0.0 | Switzerland | CHE | CHE |
| 3 | 101 | WC-1954 | 1954 FIFA World Cup | M-1954-26 | West Germany v Hungary | final | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 62500 | 1954-07-04 | 96.15 | 7.0 | 3.0 | Switzerland | CHE | CHE |
| 4 | 88 | WC-1954 | 1954 FIFA World Cup | M-1954-13 | Hungary v West Germany | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | 56000 | 1954-06-20 | 101.82 | 7.0 | 4.0 | Switzerland | CHE | CHE |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 959 | 921 | WC-2022 | 2022 FIFA World Cup | M-2022-21 | Tunisia v Australia | group stage | Group D | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Al_Janoub_Stadium | https://en.wikipedia.org/wiki/Al_Wakrah | 41823 | 2022-11-26 | 95.05 | 4.0 | 2.0 | Qatar | QAT | QAT |
| 960 | 929 | WC-2022 | 2022 FIFA World Cup | M-2022-29 | Cameroon v Serbia | group stage | Group G | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Al_Janoub_Stadium | https://en.wikipedia.org/wiki/Al_Wakrah | 39789 | 2022-11-28 | 90.43 | 4.0 | 2.0 | Qatar | QAT | QAT |
| 961 | 937 | WC-2022 | 2022 FIFA World Cup | M-2022-37 | Australia v Denmark | group stage | Group D | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Al_Janoub_Stadium | https://en.wikipedia.org/wiki/Al_Wakrah | 41232 | 2022-11-30 | 93.71 | 4.0 | 2.0 | Qatar | QAT | QAT |
| 962 | 945 | WC-2022 | 2022 FIFA World Cup | M-2022-45 | Ghana v Uruguay | group stage | Group H | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Al_Janoub_Stadium | https://en.wikipedia.org/wiki/Al_Wakrah | 43443 | 2022-12-02 | 98.73 | 5.0 | 3.0 | Qatar | QAT | QAT |
| 963 | 953 | WC-2022 | 2022 FIFA World Cup | M-2022-53 | Japan v Croatia | round of 16 | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Al_Janoub_Stadium | https://en.wikipedia.org/wiki/Al_Wakrah | 42523 | 2022-12-05 | 96.64 | 4.0 | 3.0 | Qatar | QAT | QAT |
964 rows × 48 columns
data_matches['tournament_year'] = data_matches['tournament_name'].str.split(
' ').str[0]
data_matches['tournament_year']
0 1930
1 1930
2 1930
3 1930
4 1930
...
959 2022
960 2022
961 2022
962 2022
963 2022
Name: tournament_year, Length: 964, dtype: object
data_players['full_name'] = data_players['given_name'] + \
' ' + data_players['family_name']
data_players['full_name']
0 Alan A'Court
1 Brenden Aaronson
2 Stefan Abadzhiev
3 Jean-Paul Abalo
4 Patrice Abanda
...
8480 Maciej Żurawski
8481 Szymon Żurkowski
8482 Graham Zusi
8483 Andrei Zygmantovich
8484 Valery Zykov
Name: full_name, Length: 8485, dtype: object
def short_stage_name(row):
a, b = row["group_stage"], row["knockout_stage"]
if a:
return "group_stage"
else:
return "knockout_stage"
data_matches[['stage_name', 'group_stage', 'knockout_stage']]
data_matches["short_stage_name"] = data_matches.apply(short_stage_name, axis=1)
data_matches.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | away_team_score_penalties | result | home_team_win | away_team_win | draw | total_goals_in_match | match_for_host | used_capacity_ratio | tournament_year | short_stage_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 5 | False | 47.22 | 1930 | group_stage |
| 1 | 2 | WC-1930 | 1930 FIFA World Cup | M-1930-02 | United States v Belgium | group stage | Group 4 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 3 | False | 48.57 | 1930 | group_stage |
| 2 | 3 | WC-1930 | 1930 FIFA World Cup | M-1930-03 | Yugoslavia v Brazil | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 3 | False | 43.08 | 1930 | group_stage |
| 3 | 4 | WC-1930 | 1930 FIFA World Cup | M-1930-04 | Romania v Peru | group stage | Group 3 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 4 | False | 96.15 | 1930 | group_stage |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | M-1930-05 | Argentina v France | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 1 | False | 101.82 | 1930 | group_stage |
5 rows × 42 columns
data_tournaments = pd.read_csv("./worldcup-1.1.0/data-csv/tournaments.csv")
data_tournaments.info()
data_tournaments.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22 entries, 0 to 21 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 22 non-null int64 1 tournament_id 22 non-null object 2 tournament_name 22 non-null object 3 year 22 non-null int64 4 start_date 22 non-null object 5 end_date 22 non-null object 6 host_country 22 non-null object 7 winner 22 non-null object 8 host_won 22 non-null int64 9 count_teams 22 non-null int64 10 group_stage 22 non-null int64 11 second_group_stage 22 non-null int64 12 final_round 22 non-null int64 13 round_of_16 22 non-null int64 14 quarter_finals 22 non-null int64 15 semi_finals 22 non-null int64 16 third_place_match 22 non-null int64 17 final 22 non-null int64 dtypes: int64(12), object(6) memory usage: 3.2+ KB
| key_id | tournament_id | tournament_name | year | start_date | end_date | host_country | winner | host_won | count_teams | group_stage | second_group_stage | final_round | round_of_16 | quarter_finals | semi_finals | third_place_match | final | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | 1930 | 1930-07-13 | 1930-07-30 | Uruguay | Uruguay | 1 | 13 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 1 | 2 | WC-1934 | 1934 FIFA World Cup | 1934 | 1934-05-27 | 1934-06-10 | Italy | Italy | 1 | 16 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |
| 2 | 3 | WC-1938 | 1938 FIFA World Cup | 1938 | 1938-06-04 | 1938-06-19 | France | Italy | 0 | 15 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |
| 3 | 4 | WC-1950 | 1950 FIFA World Cup | 1950 | 1950-06-24 | 1950-07-16 | Brazil | Uruguay | 0 | 13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | WC-1954 | 1954 FIFA World Cup | 1954 | 1954-06-16 | 1954-07-04 | Switzerland | West Germany | 0 | 16 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
def winner(row):
id_t = row["winner"]
first = data_squads[data_squads["team_name"]
== id_t].iloc[0]
return first["team_code"]
data_tournaments["winner_code"] = data_tournaments.apply(winner, axis=1)
data_tournaments.info()
data_tournaments.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22 entries, 0 to 21 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 22 non-null int64 1 tournament_id 22 non-null object 2 tournament_name 22 non-null object 3 year 22 non-null int64 4 start_date 22 non-null object 5 end_date 22 non-null object 6 host_country 22 non-null object 7 winner 22 non-null object 8 host_won 22 non-null int64 9 count_teams 22 non-null int64 10 group_stage 22 non-null int64 11 second_group_stage 22 non-null int64 12 final_round 22 non-null int64 13 round_of_16 22 non-null int64 14 quarter_finals 22 non-null int64 15 semi_finals 22 non-null int64 16 third_place_match 22 non-null int64 17 final 22 non-null int64 18 winner_code 22 non-null object dtypes: int64(12), object(7) memory usage: 3.4+ KB
| key_id | tournament_id | tournament_name | year | start_date | end_date | host_country | winner | host_won | count_teams | group_stage | second_group_stage | final_round | round_of_16 | quarter_finals | semi_finals | third_place_match | final | winner_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | 1930 | 1930-07-13 | 1930-07-30 | Uruguay | Uruguay | 1 | 13 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | URY |
| 1 | 2 | WC-1934 | 1934 FIFA World Cup | 1934 | 1934-05-27 | 1934-06-10 | Italy | Italy | 1 | 16 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | ITA |
| 2 | 3 | WC-1938 | 1938 FIFA World Cup | 1938 | 1938-06-04 | 1938-06-19 | France | Italy | 0 | 15 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | ITA |
| 3 | 4 | WC-1950 | 1950 FIFA World Cup | 1950 | 1950-06-24 | 1950-07-16 | Brazil | Uruguay | 0 | 13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | URY |
| 4 | 5 | WC-1954 | 1954 FIFA World Cup | 1954 | 1954-06-16 | 1954-07-04 | Switzerland | West Germany | 0 | 16 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | DEU |
data_goals = pd.read_csv("./worldcup-1.1.0/data-csv/goals.csv")
data_goals.head()
| key_id | goal_id | tournament_id | tournament_name | match_id | match_name | match_date | stage_name | group_name | team_id | ... | shirt_number | player_team_id | player_team_name | player_team_code | minute_label | minute_regulation | minute_stoppage | match_period | own_goal | penalty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | G-0001 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | 0 | T-28 | France | FRA | 19' | 19 | 0 | first half | 0 | 0 |
| 1 | 2 | G-0002 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | 0 | T-28 | France | FRA | 40' | 40 | 0 | first half | 0 | 0 |
| 2 | 3 | G-0003 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | 0 | T-28 | France | FRA | 43' | 43 | 0 | first half | 0 | 0 |
| 3 | 4 | G-0004 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-44 | ... | 0 | T-44 | Mexico | MEX | 70' | 70 | 0 | second half | 0 | 0 |
| 4 | 5 | G-0005 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | 0 | T-28 | France | FRA | 87' | 87 | 0 | second half | 0 | 0 |
5 rows × 27 columns
def late_goal(row):
match_period, minutes = row['match_period'], row['minute_regulation']
if match_period == "first half" and minutes > 43:
return True
if match_period == "second half" and minutes > 88:
return True
return False
data_goals['late_goal'] = data_goals.apply(late_goal, axis=1)
data_goals
| key_id | goal_id | tournament_id | tournament_name | match_id | match_name | match_date | stage_name | group_name | team_id | ... | player_team_id | player_team_name | player_team_code | minute_label | minute_regulation | minute_stoppage | match_period | own_goal | penalty | late_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | G-0001 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 19' | 19 | 0 | first half | 0 | 0 | False |
| 1 | 2 | G-0002 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 40' | 40 | 0 | first half | 0 | 0 | False |
| 2 | 3 | G-0003 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 43' | 43 | 0 | first half | 0 | 0 | False |
| 3 | 4 | G-0004 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-44 | ... | T-44 | Mexico | MEX | 70' | 70 | 0 | second half | 0 | 0 | False |
| 4 | 5 | G-0005 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | 1930-07-13 | group stage | Group 1 | T-28 | ... | T-28 | France | FRA | 87' | 87 | 0 | second half | 0 | 0 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2715 | 2716 | G-2716 | WC-2022 | 2022 FIFA World Cup | M-2022-64 | Argentina v France | 2022-12-18 | final | not applicable | T-03 | ... | T-03 | Argentina | ARG | 36' | 36 | 0 | first half | 0 | 0 | False |
| 2716 | 2717 | G-2717 | WC-2022 | 2022 FIFA World Cup | M-2022-64 | Argentina v France | 2022-12-18 | final | not applicable | T-28 | ... | T-28 | France | FRA | 80' | 80 | 0 | second half | 0 | 1 | False |
| 2717 | 2718 | G-2718 | WC-2022 | 2022 FIFA World Cup | M-2022-64 | Argentina v France | 2022-12-18 | final | not applicable | T-28 | ... | T-28 | France | FRA | 81' | 81 | 0 | second half | 0 | 0 | False |
| 2718 | 2719 | G-2719 | WC-2022 | 2022 FIFA World Cup | M-2022-64 | Argentina v France | 2022-12-18 | final | not applicable | T-03 | ... | T-03 | Argentina | ARG | 108' | 108 | 0 | extra time, second half | 0 | 0 | False |
| 2719 | 2720 | G-2720 | WC-2022 | 2022 FIFA World Cup | M-2022-64 | Argentina v France | 2022-12-18 | final | not applicable | T-28 | ... | T-28 | France | FRA | 118' | 118 | 0 | extra time, second half | 0 | 1 | False |
2720 rows × 28 columns
data_teams = pd.read_csv("./worldcup-1.1.0/data-csv/teams.csv")
data_teams.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 85 entries, 0 to 84 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 85 non-null int64 1 team_id 85 non-null object 2 team_name 85 non-null object 3 team_code 85 non-null object 4 federation_name 85 non-null object 5 region_name 85 non-null object 6 confederation_id 85 non-null object 7 confederation_name 85 non-null object 8 confederation_code 85 non-null object 9 team_wikipedia_link 85 non-null object 10 federation_wikipedia_link 85 non-null object dtypes: int64(1), object(10) memory usage: 7.4+ KB
data_tournaments['wines_time'] = data_tournaments.groupby(
'winner_code')["winner_code"].transform('count')
data_tournaments['wines_time']
0 2 1 4 2 4 3 2 4 4 5 5 6 5 7 1 8 5 9 4 10 3 11 4 12 3 13 4 14 5 15 2 16 5 17 4 18 1 19 4 20 2 21 3 Name: wines_time, dtype: int64
test = data_tournaments[['wines_time', "winner_code"]].drop_duplicates()
test
| wines_time | winner_code | |
|---|---|---|
| 0 | 2 | URY |
| 1 | 4 | ITA |
| 4 | 4 | DEU |
| 5 | 5 | BRA |
| 7 | 1 | ENG |
| 10 | 3 | ARG |
| 15 | 2 | FRA |
| 18 | 1 | ESP |
merge_squads = pd.merge(test, data_teams, left_on=[
"winner_code"], right_on=["team_code"], how="right")
merge_squads = merge_squads.drop(["winner_code"], axis=1)
merge_squads.sort_values("wines_time").head(30)
merge_squads["wines_time"] = merge_squads["wines_time"].fillna(0)
merge_squads['wines_time'] = merge_squads['wines_time'].astype(np.int64)
merge_squads.info()
merge_squads.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 85 entries, 0 to 84 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 wines_time 85 non-null int64 1 key_id 85 non-null int64 2 team_id 85 non-null object 3 team_name 85 non-null object 4 team_code 85 non-null object 5 federation_name 85 non-null object 6 region_name 85 non-null object 7 confederation_id 85 non-null object 8 confederation_name 85 non-null object 9 confederation_code 85 non-null object 10 team_wikipedia_link 85 non-null object 11 federation_wikipedia_link 85 non-null object dtypes: int64(2), object(10) memory usage: 8.1+ KB
| wines_time | key_id | team_id | team_name | team_code | federation_name | region_name | confederation_id | confederation_name | confederation_code | team_wikipedia_link | federation_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | T-01 | Algeria | DZA | Algerian Football Federation | Africa | CF-2 | Confederation of African Football | CAF | https://en.wikipedia.org/wiki/Algeria_national... | https://en.wikipedia.org/wiki/Algerian_Footbal... |
| 1 | 0 | 2 | T-02 | Angola | AGO | Angolan Football Federation | Africa | CF-2 | Confederation of African Football | CAF | https://en.wikipedia.org/wiki/Angola_national_... | https://en.wikipedia.org/wiki/Angolan_Football... |
| 2 | 3 | 3 | T-03 | Argentina | ARG | Argentine Football Association | South America | CF-4 | South American Football Confederation | CONMEBOL | https://en.wikipedia.org/wiki/Argentina_nation... | https://en.wikipedia.org/wiki/Argentine_Footba... |
| 3 | 0 | 4 | T-04 | Australia | AUS | Football Australia | Oceania | CF-1 | Asian Football Confederation | AFC | https://en.wikipedia.org/wiki/Australia_men%27... | https://en.wikipedia.org/wiki/Football_Australia |
| 4 | 0 | 5 | T-05 | Austria | AUT | Austrian Football Association | Europe | CF-6 | Union of European Football Associations | UEFA | https://en.wikipedia.org/wiki/Austria_national... | https://en.wikipedia.org/wiki/Austrian_Footbal... |
mean_attendance = merge_matches_attendance['Attendance'].mean()
median_attendance = merge_matches_attendance['Attendance'].median()
merge_matches_attendance.head(15)
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | stadium_wikipedia_link | city_wikipedia_link | Attendance | Date | used_capacity_ratio | attendance_category | relative_attendance_category | team_name | team_code | host_country_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 97 | WC-1954 | 1954 FIFA World Cup | M-1954-22 | West Germany v Yugoslavia | quarter-finals | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | 17000 | 1954-06-27 | 47.22 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 1 | 92 | WC-1954 | 1954 FIFA World Cup | M-1954-17 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Hardturm | https://en.wikipedia.org/wiki/Zürich | 17000 | 1954-06-23 | 48.57 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 2 | 82 | WC-1954 | 1954 FIFA World Cup | M-1954-07 | West Germany v Turkey | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 28000 | 1954-06-17 | 43.08 | 2.0 | 0.0 | Switzerland | CHE | CHE |
| 3 | 101 | WC-1954 | 1954 FIFA World Cup | M-1954-26 | West Germany v Hungary | final | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 62500 | 1954-07-04 | 96.15 | 7.0 | 3.0 | Switzerland | CHE | CHE |
| 4 | 88 | WC-1954 | 1954 FIFA World Cup | M-1954-13 | Hungary v West Germany | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | 56000 | 1954-06-20 | 101.82 | 7.0 | 4.0 | Switzerland | CHE | CHE |
| 5 | 99 | WC-1954 | 1954 FIFA World Cup | M-1954-24 | West Germany v Austria | semi-finals | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/St._Jakob_Stadium | https://en.wikipedia.org/wiki/Basel | 58000 | 1954-06-30 | 105.45 | 7.0 | 4.0 | Switzerland | CHE | CHE |
| 6 | 94 | WC-1954 | 1954 FIFA World Cup | M-1954-19 | Austria v Switzerland | quarter-finals | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Stade_Olympique_... | https://en.wikipedia.org/wiki/Lausanne | 35000 | 1954-06-26 | 70.00 | 3.0 | 1.0 | Switzerland | CHE | CHE |
| 7 | 91 | WC-1954 | 1954 FIFA World Cup | M-1954-16 | England v Switzerland | group stage | Group 4 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Wankdorf_Stadium | https://en.wikipedia.org/wiki/Bern | 43500 | 1954-06-20 | 66.92 | 5.0 | 1.0 | Switzerland | CHE | CHE |
| 8 | 76 | WC-1954 | 1954 FIFA World Cup | M-1954-01 | Brazil v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | 13470 | 1954-06-16 | 37.42 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 9 | 87 | WC-1954 | 1954 FIFA World Cup | M-1954-12 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | 19000 | 1954-06-19 | 52.78 | 1.0 | 0.0 | Switzerland | CHE | CHE |
| 10 | 89 | WC-1954 | 1954 FIFA World Cup | M-1954-14 | Turkey v South Korea | group stage | Group 2 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Charmilles_Stadium | https://en.wikipedia.org/wiki/Geneva | 4000 | 1954-06-20 | 11.11 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 11 | 77 | WC-1954 | 1954 FIFA World Cup | M-1954-02 | Yugoslavia v France | group stage | Group 1 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Stade_Olympique_... | https://en.wikipedia.org/wiki/Lausanne | 16000 | 1954-06-16 | 32.00 | 0.0 | 0.0 | Switzerland | CHE | CHE |
| 12 | 80 | WC-1954 | 1954 FIFA World Cup | M-1954-05 | Switzerland v Italy | group stage | Group 4 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Stade_Olympique_... | https://en.wikipedia.org/wiki/Lausanne | 43000 | 1954-06-17 | 86.00 | 5.0 | 1.0 | Switzerland | CHE | CHE |
| 13 | 85 | WC-1954 | 1954 FIFA World Cup | M-1954-10 | Brazil v Yugoslavia | group stage | Group 1 | 1 | 0 | 0 | ... | https://en.wikipedia.org/wiki/Stade_Olympique_... | https://en.wikipedia.org/wiki/Lausanne | 24637 | 1954-06-19 | 49.27 | 1.0 | 0.0 | Switzerland | CHE | CHE |
| 14 | 98 | WC-1954 | 1954 FIFA World Cup | M-1954-23 | Hungary v Uruguay | semi-finals | not applicable | 0 | 1 | 0 | ... | https://en.wikipedia.org/wiki/Stade_Olympique_... | https://en.wikipedia.org/wiki/Lausanne | 45000 | 1954-06-30 | 90.00 | 5.0 | 2.0 | Switzerland | CHE | CHE |
15 rows × 48 columns
mean = merge_matches_attendance['Attendance'].mean()
median = merge_matches_attendance['Attendance'].median()
trace = go.Scatter(x=list(range(1, len(
merge_matches_attendance['Attendance'])+1)), y=merge_matches_attendance['Attendance'], mode='lines', name='Data')
mean_trace = go.Scatter(x=[1, len(merge_matches_attendance['Attendance'])], y=[
mean, mean], mode='lines', name='Mean')
median_trace = go.Scatter(x=[1, len(merge_matches_attendance['Attendance'])], y=[
median, median], mode='lines', name='Median')
fig = go.Figure(data=[trace, mean_trace, median_trace])
fig.update_layout(title='Attendance with Mean & Median',
xaxis_title='Attendance',
yaxis_title='Mean & Median')
fig.show()
trace = go.Histogram(x=merge_matches_attendance['Attendance'], nbinsx=100)
# create the figure
fig = go.Figure(data=[trace])
# add title and axis labels
fig.update_layout(title='Histogram with Custom Bins',
xaxis_title='Attendance',
yaxis_title=' ')
trace = go.Box(x=merge_matches_attendance['tournament_id'],
y=merge_matches_attendance["Attendance"], name='Box Plot')
fig = go.Figure(data=[trace])
fig.update_layout(title='Box Plot Example',
yaxis_title='Value')
data_goals['goal_period'] = data_goals['minute_regulation'] + \
data_goals['minute_stoppage']
grouped_data = data_goals.groupby(['tournament_id'])['goal_period'].sum()
mean = grouped_data.mean()
trace = go.Bar(x=list(range(1, len(grouped_data)+1)),
y=grouped_data, name='Data')
mean_trace = go.Scatter(x=[1, len(grouped_data)], y=[
mean, mean], mode='lines', name='Mean')
fig = go.Figure(data=[trace, mean_trace])
fig.update_layout(title='Bar Plot with Mean and Median',
xaxis_title=' ', yaxis_title='')
fig.show()
trace_total_goals = go.Histogram(x=data_matches['total_goals_in_match'])
fig = go.Figure(data=[trace_total_goals])
fig.update_layout(title='Histogram with Custom Bins',
xaxis_title='Total Goals',
yaxis_title=' ')
most_repeatable = data_goals.groupby('tournament_id')[
'goal_period'].value_counts()
most_repeatable
tournament_id goal_period
WC-1930 65 4
89 3
67 3
12 3
20 3
..
WC-2022 20 1
16 1
79 1
77 1
118 1
Name: count, Length: 1525, dtype: int64
trace = go.Histogram(x=data_goals['late_goal'])
fig = go.Figure(data=[trace])
fig.update_layout(title='Late Goals Histogram',
xaxis_title='Value',
yaxis_title='Count')
fig.show()
players = data_goals.groupby(['player_id'])['tournament_name'].value_counts()
players = players.sort_values(ascending=False)
players = players.head(12)
print(players)
data_players_bar = [go.Bar(x=players, y=players)]
layout = go.Layout(title='Bar Plot Example', xaxis=dict(
title='Player ID'), yaxis=dict(title='Tournament Name'))
fig = go.Figure(data=data_players_bar, layout=layout)
pyo.iplot(fig)
player_id tournament_name P-02537 1958 FIFA World Cup 13 P-06982 1954 FIFA World Cup 11 P-02173 1970 FIFA World Cup 10 P-04623 1966 FIFA World Cup 9 P-03149 1950 FIFA World Cup 9 P-08490 2002 FIFA World Cup 8 P-06978 2022 FIFA World Cup 8 P-01708 1930 FIFA World Cup 8 P-02554 1974 FIFA World Cup 7 P-02781 1938 FIFA World Cup 7 P-03429 2022 FIFA World Cup 7 P-02071 1970 FIFA World Cup 7 Name: count, dtype: int64
player = data_goals.groupby(['player_id', 'tournament_id'])[
'tournament_name'].value_counts()
player = player.sort_values(ascending=False)
player = player.head(12)
print(player)
data_player = [go.Bar(x=player, y=player)]
fig = go.Figure(data=data_player)
pyo.iplot(fig)
player_id tournament_id tournament_name P-02537 WC-1958 1958 FIFA World Cup 13 P-06982 WC-1954 1954 FIFA World Cup 11 P-02173 WC-1970 1970 FIFA World Cup 10 P-03149 WC-1950 1950 FIFA World Cup 9 P-04623 WC-1966 1966 FIFA World Cup 9 P-01708 WC-1930 1930 FIFA World Cup 8 P-08490 WC-2002 2002 FIFA World Cup 8 P-06978 WC-2022 2022 FIFA World Cup 8 P-02071 WC-1970 1970 FIFA World Cup 7 P-03429 WC-2022 2022 FIFA World Cup 7 P-02554 WC-1974 1974 FIFA World Cup 7 P-02781 WC-1938 1938 FIFA World Cup 7 Name: count, dtype: int64
goals_count = data_goals.groupby(['tournament_id']).count()
goals_count = goals_count.reset_index()
data_goals_count = [
go.Bar(x=goals_count['tournament_id'], y=goals_count['key_id'])]
fig = go.Figure(data=data_goals_count)
pyo.iplot(fig)
filtered_data = data_goals[data_goals['team_name'] == ('Germany' or 'West Germany' or 'East Germany' or 'Italy' or 'Brazil')]
data_layout = [go.Scatter(x=filtered_data['goal_period'], y=filtered_data['stage_name'], mode='markers')]
layout_stipe = go.Layout(title='Strip Plot Example', xaxis=dict(title='Minutes'), yaxis=dict(title='Stage'))
fig = go.Figure(data=data_layout, layout=layout_stipe)
pyo.iplot(fig)
data_matches["combine_matches"] = list(
zip(data_matches['away_team_name'], data_matches["home_team_name"]))
freq_matches = data_matches['combine_matches'].apply(
lambda x: ','.join(tuple(sorted(x)))).value_counts()
freq_matches
combine_matches
Brazil,Sweden 7
Argentina,Netherlands 6
Argentina,Italy 5
West Germany,Yugoslavia 5
Brazil,Czechoslovakia 5
..
Argentina,Yugoslavia 1
Republic of Ireland,Romania 1
Costa Rica,Czechoslovakia 1
Cameroon,Colombia 1
France,Morocco 1
Name: count, Length: 649, dtype: int64
trace = go.Bar(x=[pair for pair in freq_matches.head(10).index],
y=freq_matches.head(10).values,
marker_color='royalblue')
layout = go.Layout(title='Top 10 Most Frequent Tuples',
xaxis_title='matches',
yaxis_title='Frequency')
fig = go.Figure(data=[trace], layout=layout)
fig.show()
data_squads["full_name"] = data_squads["given_name"] + \
" " + data_squads["family_name"]
players = data_squads[["player_id", "full_name", "team_name"]].drop_duplicates().groupby(
["player_id", "full_name"])
players2 = players.size().reset_index(name="team_represent_count").sort_values(
"team_represent_count", ascending=False)
players2
| player_id | full_name | team_represent_count | |
|---|---|---|---|
| 2016 | P-02369 | Dejan Stanković | 3 |
| 6602 | P-07779 | Karl-Heinz Riedle | 2 |
| 1298 | P-01512 | José Altafini | 2 |
| 4580 | P-05394 | José Santamaría | 2 |
| 5663 | P-06666 | Sergei Gorlukovich | 2 |
| ... | ... | ... | ... |
| 2832 | P-03333 | Antonio Di Gennaro | 1 |
| 2831 | P-03331 | Vahid Halilhodžić | 1 |
| 2830 | P-03330 | Dietmar Kühbauer | 1 |
| 2829 | P-03329 | Ciriaco Sforza | 1 |
| 8484 | P-09999 | Giorgian De Arrascaeta | 1 |
8485 rows × 3 columns
trace = go.Bar(x=[pair for pair in players2.head(10).full_name],
y=players2.head(10).team_represent_count,
marker_color='royalblue')
layout = go.Layout(title='players count teams',
xaxis_title='players',
yaxis_title='teams number')
fig = go.Figure(data=[trace], layout=layout)
fig.show()
الاسباب وراء تغير اللاعب عدة فرق وذلك بسبب فشله في فوز كأس العالم مع الفريق الذي انتمى اليه والمحاولة مع فرق اخرى
def chi_sq_test(cont_table):
chi2, p, dof, con_table = scp.stats.chi2_contingency(cont_table)
print(f'chi-squared = {chi2}\np value= {p}\ndegrees of freedom = {dof}')
def cramers_v(cross_tabs):
# getting the chi sq. stat
chi2 = scp.stats.chi2_contingency(cross_tabs)[0]
# calculating the total number of observations
n = cross_tabs.sum().sum()
# getting the degrees of freedom
dof = min(cross_tabs.shape)-1
# calculating cramer's v
v = np.sqrt(chi2/(n*dof))
# printing results
print(f'V = {v}')
print(f'Cramer\'s V Degrees of Freedom = {dof}')
# print(f'\nEffect Size Thresholds\n{sizes}\n') # print(f'\nEffect Size Thresholds\n{sizes}\n') # print(f'\nEffect Size Thresholds\n{sizes}\n')
cross_tabs = pd.crosstab(index=data_tournaments['host_country'],
columns=data_tournaments['winner'])
chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 138.96666666666664 p value= 0.4133433398768274 degrees of freedom = 136 V = 0.8885850175044215 Cramer's V Degrees of Freedom = 8
يوجد ارتباط قوي بين كون الدولة المضيفة هي الدولة التي سوف تربح في نهائيات كأس العالم ولكن الارتباط غير واضح كتوزع احصائي
dd = pd.merge(data_matches, merge_matches_attendance, on="match_id",
suffixes=('', '_y')).filter(regex='^(?!.*_y)')
cross_tabs = pd.crosstab(index=dd['match_for_host'],
columns=dd['relative_attendance_category'])
chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 14.595888271509283 p value= 0.005617119190134524 degrees of freedom = 4 V = 0.12304862021676755 Cramer's V Degrees of Freedom = 1
يوجد ارتباط قوي بين كون الدولة المضيفة هي التي تلعب مع نسبة فئة الحضور والارتباط واضح كتوزع احصائي وذلك بسبب كون اهل البلد يحضرون لعب بلدهم اكثر من اللعب الاخرى
cross_tabs = pd.crosstab(index=merge_matches_attendance['attendance_category'],
columns=merge_matches_attendance['country_name'])
chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 928.864659513315 p value= 2.4818485116997206e-111 degrees of freedom = 153 V = 0.32720237559623216 Cramer's V Degrees of Freedom = 9
لا يوجد ارتباط قوي بين كون الدولة المضيفة و فئة الحضور و لا يوجدارتباط كتوزع احصائي
def get_result_of_match(row):
if row['home_team_score'] >= row['away_team_score']:
row["team_id"] = row["home_team_id"]
row["team_name"] = row["home_team_name"]
row["team_score"] = row["home_team_score"]
return row[['team_id', 'team_name', 'team_score']]
if row['home_team_score'] < row['away_team_score']:
row["team_id"] = row["away_team_id"]
row["team_name"] = row["away_team_name"]
row["team_score"] = row["away_team_score"]
return row[['team_id', 'team_name', 'team_score']]
return None
final_stages_data_matches = data_matches[(data_matches['stage_name'] ==
'final') | (data_matches['stage_name'] ==
'semi-finals')]
final_result = final_stages_data_matches.apply(get_result_of_match, axis=1)
final_result
| team_id | team_name | team_score | |
|---|---|---|---|
| 15 | T-03 | Argentina | 6 |
| 16 | T-81 | Uruguay | 6 |
| 17 | T-81 | Uruguay | 4 |
| 31 | T-20 | Czechoslovakia | 3 |
| 32 | T-39 | Italy | 1 |
| 34 | T-39 | Italy | 2 |
| 49 | T-34 | Hungary | 5 |
| 50 | T-39 | Italy | 2 |
| 52 | T-39 | Italy | 4 |
| 97 | T-34 | Hungary | 4 |
| 98 | T-83 | West Germany | 6 |
| 100 | T-83 | West Germany | 3 |
| 132 | T-09 | Brazil | 5 |
| 133 | T-72 | Sweden | 3 |
| 135 | T-09 | Brazil | 5 |
| 164 | T-09 | Brazil | 4 |
| 165 | T-20 | Czechoslovakia | 3 |
| 167 | T-09 | Brazil | 3 |
| 196 | T-83 | West Germany | 2 |
| 197 | T-27 | England | 2 |
| 199 | T-27 | England | 4 |
| 228 | T-09 | Brazil | 3 |
| 229 | T-39 | Italy | 4 |
| 231 | T-09 | Brazil | 4 |
| 269 | T-83 | West Germany | 2 |
| 307 | T-03 | Argentina | 3 |
| 356 | T-39 | Italy | 2 |
| 357 | T-83 | West Germany | 3 |
| 359 | T-39 | Italy | 3 |
| 408 | T-83 | West Germany | 2 |
| 409 | T-03 | Argentina | 2 |
| 411 | T-03 | Argentina | 3 |
| 460 | T-03 | Argentina | 1 |
| 461 | T-83 | West Germany | 1 |
| 463 | T-83 | West Germany | 1 |
| 512 | T-39 | Italy | 2 |
| 513 | T-09 | Brazil | 1 |
| 515 | T-09 | Brazil | 0 |
| 576 | T-09 | Brazil | 1 |
| 577 | T-28 | France | 2 |
| 579 | T-28 | France | 3 |
| 640 | T-29 | Germany | 1 |
| 641 | T-09 | Brazil | 1 |
| 643 | T-09 | Brazil | 2 |
| 704 | T-39 | Italy | 2 |
| 705 | T-28 | France | 1 |
| 707 | T-39 | Italy | 1 |
| 768 | T-46 | Netherlands | 3 |
| 769 | T-71 | Spain | 1 |
| 771 | T-71 | Spain | 1 |
| 832 | T-29 | Germany | 7 |
| 833 | T-46 | Netherlands | 0 |
| 835 | T-29 | Germany | 1 |
| 896 | T-28 | France | 1 |
| 897 | T-17 | Croatia | 2 |
| 899 | T-28 | France | 4 |
| 960 | T-03 | Argentina | 3 |
| 961 | T-28 | France | 2 |
| 963 | T-03 | Argentina | 3 |
final_result.info()
<class 'pandas.core.frame.DataFrame'> Index: 59 entries, 15 to 963 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 team_id 59 non-null object 1 team_name 59 non-null object 2 team_score 59 non-null int64 dtypes: int64(1), object(2) memory usage: 1.8+ KB
final_result.sort_values("team_score", ascending=False)
| team_id | team_name | team_score | |
|---|---|---|---|
| 832 | T-29 | Germany | 7 |
| 15 | T-03 | Argentina | 6 |
| 98 | T-83 | West Germany | 6 |
| 16 | T-81 | Uruguay | 6 |
| 49 | T-34 | Hungary | 5 |
| 135 | T-09 | Brazil | 5 |
| 132 | T-09 | Brazil | 5 |
| 199 | T-27 | England | 4 |
| 164 | T-09 | Brazil | 4 |
| 229 | T-39 | Italy | 4 |
| 231 | T-09 | Brazil | 4 |
| 97 | T-34 | Hungary | 4 |
| 52 | T-39 | Italy | 4 |
| 899 | T-28 | France | 4 |
| 17 | T-81 | Uruguay | 4 |
| 411 | T-03 | Argentina | 3 |
| 768 | T-46 | Netherlands | 3 |
| 960 | T-03 | Argentina | 3 |
| 359 | T-39 | Italy | 3 |
| 357 | T-83 | West Germany | 3 |
| 307 | T-03 | Argentina | 3 |
| 579 | T-28 | France | 3 |
| 963 | T-03 | Argentina | 3 |
| 228 | T-09 | Brazil | 3 |
| 31 | T-20 | Czechoslovakia | 3 |
| 167 | T-09 | Brazil | 3 |
| 165 | T-20 | Czechoslovakia | 3 |
| 133 | T-72 | Sweden | 3 |
| 100 | T-83 | West Germany | 3 |
| 577 | T-28 | France | 2 |
| 961 | T-28 | France | 2 |
| 897 | T-17 | Croatia | 2 |
| 34 | T-39 | Italy | 2 |
| 50 | T-39 | Italy | 2 |
| 704 | T-39 | Italy | 2 |
| 643 | T-09 | Brazil | 2 |
| 269 | T-83 | West Germany | 2 |
| 408 | T-83 | West Germany | 2 |
| 197 | T-27 | England | 2 |
| 512 | T-39 | Italy | 2 |
| 356 | T-39 | Italy | 2 |
| 196 | T-83 | West Germany | 2 |
| 409 | T-03 | Argentina | 2 |
| 576 | T-09 | Brazil | 1 |
| 771 | T-71 | Spain | 1 |
| 32 | T-39 | Italy | 1 |
| 896 | T-28 | France | 1 |
| 835 | T-29 | Germany | 1 |
| 769 | T-71 | Spain | 1 |
| 460 | T-03 | Argentina | 1 |
| 707 | T-39 | Italy | 1 |
| 705 | T-28 | France | 1 |
| 461 | T-83 | West Germany | 1 |
| 463 | T-83 | West Germany | 1 |
| 641 | T-09 | Brazil | 1 |
| 513 | T-09 | Brazil | 1 |
| 640 | T-29 | Germany | 1 |
| 515 | T-09 | Brazil | 0 |
| 833 | T-46 | Netherlands | 0 |
final_result.team_name.value_counts()
team_name Brazil 11 Italy 10 West Germany 8 Argentina 7 France 6 Germany 3 Uruguay 2 Czechoslovakia 2 Hungary 2 England 2 Netherlands 2 Spain 2 Sweden 1 Croatia 1 Name: count, dtype: int64
team_score = final_result.groupby(["team_name"]).sum(
"team_score").reset_index("team_name")
count_team = final_result.groupby(
["team_name"]).count().reset_index("team_name")
count_team["count"] = count_team["team_score"]
count_team[["team_name", "count"]]
| team_name | count | |
|---|---|---|
| 0 | Argentina | 7 |
| 1 | Brazil | 11 |
| 2 | Croatia | 1 |
| 3 | Czechoslovakia | 2 |
| 4 | England | 2 |
| 5 | France | 6 |
| 6 | Germany | 3 |
| 7 | Hungary | 2 |
| 8 | Italy | 10 |
| 9 | Netherlands | 2 |
| 10 | Spain | 2 |
| 11 | Sweden | 1 |
| 12 | Uruguay | 2 |
| 13 | West Germany | 8 |
team_score.sort_values("team_name")
| team_name | team_score | |
|---|---|---|
| 0 | Argentina | 21 |
| 1 | Brazil | 29 |
| 2 | Croatia | 2 |
| 3 | Czechoslovakia | 6 |
| 4 | England | 6 |
| 5 | France | 13 |
| 6 | Germany | 9 |
| 7 | Hungary | 9 |
| 8 | Italy | 23 |
| 9 | Netherlands | 3 |
| 10 | Spain | 2 |
| 11 | Sweden | 3 |
| 12 | Uruguay | 10 |
| 13 | West Germany | 20 |
merge_team = pd.merge(
team_score, count_team[["team_name", "count"]], on="team_name")
merge_team["ratio"] = merge_team["team_score"]/merge_team["count"]
merge_team.sort_values("ratio", ascending=False)
| team_name | team_score | count | ratio | |
|---|---|---|---|---|
| 12 | Uruguay | 10 | 2 | 5.000000 |
| 7 | Hungary | 9 | 2 | 4.500000 |
| 0 | Argentina | 21 | 7 | 3.000000 |
| 3 | Czechoslovakia | 6 | 2 | 3.000000 |
| 4 | England | 6 | 2 | 3.000000 |
| 6 | Germany | 9 | 3 | 3.000000 |
| 11 | Sweden | 3 | 1 | 3.000000 |
| 1 | Brazil | 29 | 11 | 2.636364 |
| 13 | West Germany | 20 | 8 | 2.500000 |
| 8 | Italy | 23 | 10 | 2.300000 |
| 5 | France | 13 | 6 | 2.166667 |
| 2 | Croatia | 2 | 1 | 2.000000 |
| 9 | Netherlands | 3 | 2 | 1.500000 |
| 10 | Spain | 2 | 2 | 1.000000 |
cross_tabs = pd.crosstab(index=merge_team['count'],
columns=merge_team['team_score'])
chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 80.50000000000001 p value= 0.06778471571508053 degrees of freedom = 63 V = 0.9063269671749659 Cramer's V Degrees of Freedom = 7
ان الفرق التي وصلت للنصف النهائي والنهائي وفازت فان من العقول وجود رابط قوي بين عدد المرات التي فازت بها وعدد الاهداف التي حففتها